Course Name: Intro to Machine Learning
Faculty: Faculty of Engineering, Tel-Aviv University
Name: Denise Bishevsky and Elinor Bengayev
Date: 18.7.2021
Intro: This project is the final project in the "Introduction to Machine Learning" course at the Faculty of Engineering, as part of the B.Sc. degree 'Sciences for High-Tech' at Tel Aviv University. In this project we received a large data set containing various features of hotel bookings, together with labels indicating whether each booking was canceled. The aim of the project is to predict, using machine learning models, whether a particular booking is likely to be canceled.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc
from numpy import interp
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.express as px
import scipy.stats as stats
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.model_selection import validation_curve
from sklearn.metrics import roc_auc_score
from scipy.stats import chi2_contingency
# Load the training features, their labels, and the held-out test features.
df = pd.read_csv("feature_data.csv")
label = pd.read_csv("label_data.csv")
X_test = pd.read_csv("feature_data_test.csv")
X_test_Order_ID = X_test["Unnamed: 0"] # kept aside for exporting the submission file later
# Check how much data df contains:
df.shape
df.head()
df.info()
# Check columns for null values by percentage
df.isnull().mean()*100
Let's review the statistical summary of the numerical features:
df.describe().T
# The unnamed first column appears to be a unique order identifier —
# rename it to order_id in all three frames.
df["Unnamed: 0"].value_counts()
df = df.rename(columns = {'Unnamed: 0' : 'order_id'})
label = label.rename(columns = {'Unnamed: 0' : 'order_id'})
X_test = X_test.rename(columns = {'Unnamed: 0' : 'order_id'})
# Attach the cancelation labels to the features for joint exploration.
df = pd.merge(df, label, how='left', on='order_id')
# Collect the numeric columns; order_id is an identifier, not a feature.
numerical_features = df.select_dtypes(include=np.number)
numerical_features.pop('order_id')
print("The numerical features are: \n", numerical_features.columns)
numerical_features.hist(figsize=(20,20), color = '#6d2a55',bins=30)
plt.tight_layout()
plt.show()
From observation of this histogram we can notice some things:
Many histograms are tail heavy. This may make it a bit harder for some algorithms to detect patterns.
numerical_features['order_year'].value_counts()
# Drop order_year from the numeric view (it is explored as categorical below).
numerical_features.pop('order_year');
## Reorder the months for better visualization
Month = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
# Check the total demand per month between 2015-2017
plt.figure(figsize=(15,15))
sns.countplot(df['order_month'], palette='rocket', order = Month)
plt.title('Count of orders per month in total', weight='bold')
plt.xticks(rotation = 80)
plt.xlabel('Order Month', fontsize=30)
plt.show()
# Bar chart: how many bookings were canceled vs checked in, annotated
# with the percentage each bar represents.
ax = df['cancelation'].value_counts().plot(kind='bar', figsize=(10,7),
                                           color="#fbb4ae", fontsize=13);
ax.set_alpha(0.8)
ax.set_title("Canceled vs Checked-In in Total", fontsize=18)
plt.xticks([0,1],['Checked-In', 'Canceled'])
ax.set_ylabel("Count", fontsize=18);
# create a list to collect the plt.patches data
totals = []
# find the values and append to list
for i in ax.patches:
    totals.append(i.get_height())
# sum of all bar heights, used to turn each bar into a percentage
total = sum(totals)
# set individual bar labels using the list above
for i in ax.patches:
    ax.text(i.get_x()+.15, i.get_height()+.5, \
            str(round((i.get_height()/total)*100, 2))+'%', fontsize=15,
            color='dimgrey', ha='left', va='bottom')
# Display `ADR` (average daily rate) vs `Booking Cancellation Status` by month.
# Work on a copy so EDA-only type changes don't touch the modeling frame.
df_eda = df.copy()
df_eda['adr'] = df_eda['adr'].astype(float)
plt.figure(figsize=(30,15))
sns.barplot(x='order_month', y='adr', hue='cancelation', dodge=True, palette= sns.color_palette(['#fbb4ae', '#84ffe3']), data=df_eda, order=Month)
sns.set(font_scale = 2.2)
plt.title('Arrival Month vs ADR vs Booking Cancellation Status', weight='bold', size=40)
plt.xlabel('Arrival Month', fontsize=50)
plt.ylabel('ADR', fontsize=50)
August is the busiest month both in terms of number of arrivals and in terms of the number of bookings made.
Notice that in the months of July, August and September, cancelled bookings have higher number of ADR than those that weren’t cancelled.
numerical_features.describe().T
# Numeric columns suspected to contain outliers; inspected with boxplots.
outliers = ['time_until_order', 'adults', 'children', 'babies', 'prev_canceled', 'prev_not_canceled', 'changes', 'adr', 'anon_feat_0', 'anon_feat_1', 'anon_feat_4', 'anon_feat_5', 'anon_feat_6', 'anon_feat_11']
n = 1
plt.figure(figsize=(30,30))
for column in outliers:
    plt.subplot(4,4,n)
    n = n+1
    sns.boxplot(df[column], color = '#6d2a55')
plt.tight_layout()
# Correlation heatmap; the upper triangle is masked since the matrix is symmetric.
fig = plt.figure(figsize=(60,40))
mask = np.zeros_like(df.corr())
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(40, 30))
    ax = sns.heatmap(df.corr(),annot=True,fmt='.2f',cmap='vlag_r', mask=mask, vmin=-1, vmax= 1,center= 0, square=True,linewidths=2, cbar_kws={"shrink": .5})
sns.set(font_scale = 3)
plt.title("Correlation Matrix",size=30, weight='bold')
- time_until_order and anon_feat_11
- cancelation and anon_feat_13
- anon_feat_9 and order_id
- anon_feat_9 and agent
- anon_feat_4 and anon_feat_5
# Collect the categorical (object-dtype) columns, plus order_year which is
# numeric but behaves as a category.
# Fix: the `np.object` alias was deprecated in NumPy 1.20 and removed in
# 1.24; the builtin `object` selects exactly the same columns.
categorical_features = df.select_dtypes(include=object)
categorical_features['order_year'] = df['order_year']
print("The categorical features are: \n", categorical_features.columns)
df.describe(include=object)
A view on the categorical distribution and the top 10 attributes for every feature:
# Horizontal bar charts: the ten most frequent values of each categorical feature.
fig = plt.figure(figsize = (50,50))
for i, category in enumerate(categorical_features):
    ax = fig.add_subplot(4,2,i+1)
    ax = df[category].value_counts().head(10).plot.barh(color = '#6d2a55')
    ax.set_title(category)
# Cancelation counts per order type.
plt.figure(figsize=(30,10))
sns.countplot(x=df_eda["order_type"], hue=df_eda["cancelation"], palette='rocket')
plt.title("Cancelation Rate by Order Type", size=30, weight='bold')
plt.xlabel("Order Type", size=30)
plt.ylabel("Count", size=30)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend(["not canceled", "canceled"], loc='upper right',fontsize=20)
plt.tight_layout()
plt.show()
We can see that:
## Count how many visitors from each country made a booking and how many canceled it
total_booking = df_eda['country'].value_counts()
data_canceled = df_eda.loc[df_eda['cancelation'] == True, 'country'].value_counts()
# Per-country ratio of canceled bookings to total bookings (0 where no cancellations).
cancelation_ratio = data_canceled.divide(total_booking, fill_value=0)
## Choropleth trace: color each country by its cancellation ratio.
data_r = dict(
    autocolorscale=False,
    colorscale = 'Pinkyl',
    type = 'choropleth',
    locations = cancelation_ratio.index,
    z = cancelation_ratio
)
## Create a layout object
layout_r = dict(
    title = 'The ratio between the number of bookings and the number of cancellations in each country',
    geo = dict(
        showframe = False,
        projection = dict(type = 'orthographic')
    )
)
## Display the world cancellation-ratio choropleth.
fig_r = go.Figure(data = [data_r],layout = layout_r)
fig_r.update_geos( resolution=50,
                   showocean=True, oceancolor="LightBlue",
)
iplot(fig_r)
# check for missing values
df.isnull().mean()*100
# Drop anon_feat_13 and company — presumably because of their high missing
# rate seen above; verify against the null-percentage output.
df = df.drop(['anon_feat_13', 'company'],axis=1)
numerical_features = numerical_features.drop(['anon_feat_13', 'company'], axis=1)
# Drop order_month — presumably redundant with order_week (turned into
# season later); confirm.
df = df.drop('order_month',axis=1)
categorical_features = categorical_features.drop(['order_month'],axis=1)
X, y = df.drop(columns = 'cancelation'), df.cancelation
# Split to train and validation data sets; random_state is fixed so the
# same split can be reproduced.
X_train, X_validation, y_train, y_validation = train_test_split(X, y,
                                                                test_size = 0.2,
                                                                random_state = 42)
# A copy of the raw train set, used later by preprocessing_func to derive
# fill statistics for the validation and test sets.
X_train_copy = X_train.copy()
# Check the label distribution is similar in both splits.
y_train.value_counts()/len(y_train)
y_validation.value_counts()/len(y_validation)
# Columns whose missing values are filled with the train-set median (numeric)
# or mode (categorical). Iterating a DataFrame yields its column names.
median_fill_features = df[['adr','anon_feat_0','anon_feat_5','anon_feat_6','children','changes',
                           'time_until_order','anon_feat_10','anon_feat_11']]
mode_fill_features = df[['anon_feat_7','anon_feat_9','agent','country','deposit_type','customer_type']]
for col in median_fill_features:
    med = X_train[col].median()
    X_train[col] = X_train[col].fillna(med)
for col in mode_fill_features:
    mode = X_train[col].mode()[0]
    X_train[col] = X_train[col].fillna(mode)
# check for missing values
X_train.isnull().mean()*100
Replacing the undefined values we saw earlier in order_type and acquisition_channel with their mode values.
# Replace the literal 'Undefined' category with the train-set mode.
mode = X_train['order_type'].mode()[0]
X_train['order_type'] = X_train['order_type'].apply(lambda x: mode if x == 'Undefined' else x)
mode = X_train['acquisition_channel'].mode()[0]
X_train['acquisition_channel'] = X_train['acquisition_channel'].apply(lambda x: mode if x == 'Undefined' else x)
# Previous attempt at z-score outlier removal, kept for reference:
# # adding the cancelation feature so the removal will be from both data sets respectively.
# X_train['cancelation'] = y_train
# for col in outliers:
#     mean = X_train[col].mean()
#     std = X_train[col].std(ddof = 0)
#     z_scores_train = (X_train[col] - mean)/(std)
#     X_train = X_train[z_scores_train < 5]
# # returning to the original split data sets
# X_train, y_train = X_train.drop(columns = 'cancelation'), X_train.cancelation
# X_train.shape
# Strip the prefix from order_week and parse the week number — assumes the
# format '<year>_<week>' with a 5-character prefix; TODO confirm.
X_train['order_week'] = X_train['order_week'].apply(lambda x: x[5:])
X_train['order_week'] = pd.to_numeric(X_train['order_week'])
def order_week_to_season(row):
    """Map a row's numeric order week to a season name.

    Weeks 1-9 and 49-53 -> 'Winter', 10-21 -> 'Spring', 22-34 -> 'Summer',
    35-48 -> 'Fall'. Any week outside 1-53 falls through and returns None.
    """
    week = row['order_week']
    if 0 < week <= 9 or 48 < week <= 53:
        return 'Winter'
    if 9 < week <= 21:
        return 'Spring'
    if 21 < week <= 34:
        return 'Summer'
    if 34 < week <= 48:
        return 'Fall'
# Derive a season feature from the numeric order week.
X_train['season'] = X_train.apply(lambda row : order_week_to_season(row), axis=1)
# Bucket the day of month into three periods.
time_in_month = pd.cut(X_train["order_day_of_month"],
                       bins=[0, 10, 20 ,31],
                       labels=['Begining', "Middle", "End"])
X_train['time_in_month'] = time_in_month
# Binary flag: is the booking's country among the ten most frequent?
top_ten_countries = pd.value_counts(X_train['country']).iloc[:10].index
X_train['is_in_top_ten_countries'] = X_train.country.isin(top_ten_countries) == 1
# Total party size.
X_train['num_of_guests'] = X_train['adults'] + X_train['children'] + X_train['babies']
# Historical cancellation ratio per guest; NaN (no booking history) becomes -1.
X_train.loc[:, 'cancellation_rate'] = (X_train['prev_canceled'] / (X_train['prev_canceled'] + X_train['prev_not_canceled'])).copy()
X_train.loc[:, 'cancellation_rate'].fillna(-1, inplace=True)
X_train.head()
# convert anon_feat_12 from bool to int
X_train["anon_feat_12"] = X_train["anon_feat_12"].astype(int)
# Register the engineered features in the numerical-features frame
# (used later for correlation checks and scaling-column selection).
numerical_features['order_week'] = X_train['order_week']
numerical_features['anon_feat_12'] = X_train['anon_feat_12']
numerical_features['num_of_guests'] = X_train['num_of_guests']
numerical_features['cancellation_rate'] = X_train['cancellation_rate']
numerical_features['is_in_top_ten_countries'] = X_train['is_in_top_ten_countries']
def chi_square_of_df_cols(df, col1, col2):
    """Chi-square test of independence between two categorical columns.

    Prints the conclusion at the 95% confidence level: if p < 0.05 the
    features are dependent (one of them is redundant and may be removed);
    otherwise they are independent and both should be kept.
    """
    df_col1, df_col2 = df[col1], df[col2]
    contigency = pd.crosstab(df_col1, df_col2)
    c, p, dof, expected = chi2_contingency(contigency)
    if (p < 0.05):
        print('p value is',p,'meaning that we reject the null hypothesis at 95% level of confidence.')
        print('the features',col1,'and',col2,'are dependent, thus we can remove one of them.\n')
    else:
        print('p value is',p,'meaning that we do not reject the null hypothesis at 95% level of confidence.')
        # Bug fix: independent features each carry information, so the
        # correct conclusion is to keep both (the original printed
        # "remove one of them" in this branch too).
        print('the features',col1,'and',col2,'are independent, thus we should keep both of them.\n')
# Test whether order_type and acquisition_channel are statistically dependent.
chi_square_of_df_cols(X_train, 'order_type', 'acquisition_channel')
# Earlier attempt: keep only the features most correlated with the label.
# X, y = df.drop(columns = 'cancelation'), df.cancelation
# X.corrwith(y).sort_values(ascending=False)
# top_ten_corolation = ['deposit_type','anon_feat_11','prev_canceled','time_until_order','anon_feat_7',
#                       'anon_feat_8','anon_feat_10','anon_feat_10','anon_feat_5',
#                       'acquisition_channel','changes','anon_feat_9']
# X = X[top_ten_corolation]
# Absolute correlation of every numeric feature with the label, descending.
numerical_features.corrwith(y_train).abs().sort_values(ascending=False)
The following features have the lowest correlation with the label so we decided to drop them.
The following features have different reasons for removal as described:
# Features to remove: low label correlation, identifiers, or columns
# replaced by engineered features (season, num_of_guests, cancellation_rate).
dropped_features = ['adults','babies','country','order_id','children','order_week','order_day_of_month','acquisition_channel','prev_canceled','prev_not_canceled','anon_feat_0','anon_feat_12']
numerical_features = numerical_features.drop(['adults','babies','children','order_week','order_day_of_month','anon_feat_12',
                                              'prev_canceled','prev_not_canceled','anon_feat_0'], axis = 1)
def drop_features(data, dropped_features):
    """Return *data* without the columns listed in *dropped_features*."""
    return data.drop(columns=dropped_features)
X_train = drop_features(X_train, dropped_features)
X_train.shape
# Previous attempt of using label encoding on the categorical features:
# for col in categorical_features:
#     label_encoder = preprocessing.LabelEncoder()
#     X_train[col] = label_encoder.fit_transform(X_train[col])
# Updating the categorical features after the changes above.
categorical_features = ['season', 'order_type', 'deposit_type', 'customer_type',
                        'order_year','time_in_month']
# One-hot encode the remaining categorical features.
X_train = pd.get_dummies(data=X_train, columns=categorical_features)
X_train.info()
def preprocessing_func (trained_data, data, y = pd.DataFrame()):
    """Apply the train-set preprocessing pipeline to another data set.

    trained_data : the raw (unprocessed) training features; all medians,
        modes and top-country lists are derived from it so no information
        leaks from the validation/test sets.
    data : the data set to transform (validation or test features);
        modified in place and also returned.
    y : optional matching labels, returned unchanged.

    Returns (data, y).

    NOTE(review): the default ``y = pd.DataFrame()`` is a mutable default
    evaluated once at definition time; harmless here since y is only
    passed through, but worth confirming.
    NOTE(review): relies on the globals ``df``, ``order_week_to_season``,
    ``drop_features`` and ``dropped_features`` being defined.
    """
    #### Fill NA
    # We fill all features because we can't know in advance which features
    # have missing values in the validation and test data.
    median_fill_features = df[['adr', 'adults', 'time_until_order', 'children', 'babies', 'anon_feat_0', 'anon_feat_1',
                               'anon_feat_10', 'anon_feat_11', 'anon_feat_2', 'anon_feat_4', 'anon_feat_5', 'anon_feat_6',
                               'changes', 'prev_canceled', 'prev_not_canceled']]
    mode_fill_features = df[['agent', 'anon_feat_3', 'anon_feat_7', 'anon_feat_8', 'anon_feat_9','anon_feat_12', 'order_year',
                             'order_day_of_month','order_week','order_type','acquisition_channel',
                             'deposit_type','customer_type','order_year']]
    # Numeric gaps -> train median; categorical gaps -> train mode.
    for col in median_fill_features:
        med = trained_data[col].median()
        data[col] = data[col].fillna(med)
    for col in mode_fill_features:
        mode = trained_data[col].mode()[0]
        data[col] = data[col].fillna(mode)
    #### Handle undefined values
    # Replace the literal 'Undefined' category with the train-set mode.
    mode = trained_data['order_type'].mode()[0]
    data['order_type'] = data['order_type'].apply(lambda x: mode if x == 'Undefined' else x)
    mode = trained_data['acquisition_channel'].mode()[0]
    data['acquisition_channel'] = data['acquisition_channel'].apply(lambda x: mode if x == 'Undefined' else x)
    ### Discretization
    # Strip the prefix from order_week — assumes the '<year>_<week>'
    # format with a 5-character prefix; TODO confirm.
    data['order_week'] = data['order_week'].apply(lambda x: x[5:])
    data['order_week'] = pd.to_numeric(data['order_week'])
    data['season'] = data.apply(lambda row : order_week_to_season(row), axis=1)
    # changing the month days to periods
    time_in_month = pd.cut(data["order_day_of_month"],
                           bins=[0, 10, 20 ,31],
                           labels=['Begining', "Middle", "End"])
    data['time_in_month'] = time_in_month
    # changing the country column to binary: is it one of the train set's
    # ten most frequent countries?
    top_ten_countries = pd.value_counts(trained_data['country']).iloc[:10].index
    data['is_in_top_ten_countries'] = data.country.isin(top_ten_countries) == 1
    # combining adults, children and babies
    data['num_of_guests'] = data['adults'] + data['children'] + data['babies']
    # calculate cancellation_rate; NaN (no booking history) becomes -1
    data.loc[:, 'cancellation_rate'] = (data['prev_canceled'] /
                                        (data['prev_canceled'] + data['prev_not_canceled'])).copy()
    data.loc[:, 'cancellation_rate'].fillna(-1, inplace=True)
    # convert anon_feat_12 from bool to int
    data["anon_feat_12"] = data["anon_feat_12"].astype(int)
    # one-hot encode the categorical features
    categorical_features = ['season', 'order_type', 'deposit_type', 'customer_type',
                            'order_year','time_in_month']
    data = pd.get_dummies(data=data, columns=categorical_features)
    # dropping the same features we dropped from the train set.
    data = drop_features(data, dropped_features)
    return data, y
# Preprocess the validation set using statistics derived from the raw train copy.
X_validation, y_validation = preprocessing_func(X_train_copy, X_validation, y_validation)
X_validation.shape
X_train.shape
# Previous attempt: Box-Cox transform of the most skewed features, kept for reference.
# box_cox_transform = ['adr','anon_feat_1','anon_feat_11','time_until_order']
# def make_positive(x, min_val):
#     return x + min_val + 0.0000001
# for col in box_cox_transform:
#     min_val = X_train[col].min()
#     if (min_val >=0):
#         min_val = 0
#     else: min_val = abs(min_val)
#     X_train[col] = X_train[col].apply(lambda x : make_positive(x, min_val))
#     fitted_data, fitted_lambda = stats.boxcox(X_train[col])
#     X_train[col] = fitted_data
# Standardize the numeric columns. The scaler is fitted on the train set
# only and reused for the validation/test sets.
cols = numerical_features.columns
scaler = StandardScaler()
X_train[cols] = scaler.fit_transform(X_train[cols])
pd.options.display.float_format = '{:.2f}'.format
X_train.describe()
# Reuse the scaler that was fitted on the training set.
def apply_scaling(data, scaler):
    """Standardize the numeric columns of *data* (named by the global
    ``cols``) with the already-fitted *scaler*; mutates and returns *data*."""
    scaled = scaler.transform(data[cols])
    data[cols] = scaled
    return data
# Scale the validation set with the train-fitted scaler.
X_validation = apply_scaling(X_validation, scaler)
print("Train shape",X_train.shape,"\nValidation shape", X_validation.shape)
# Fit PCA on the scaled training features and plot the cumulative
# explained variance, to choose how many components to keep.
model = PCA()
model.fit_transform(X_train)
variance = model.explained_variance_
cum_var = np.cumsum(variance)/np.sum(variance)
# Generalization: derive the component count from the data instead of the
# hard-coded range(1,43), so the plot stays correct if the feature set changes.
n_components = len(cum_var)
plt.figure(figsize = (20,20))
plt.bar(range(1, n_components + 1), cum_var*100, alpha = 0.5, align = 'center', label = 'cummulative variance', color = '#c4b1c4')
plt.legend()
plt.ylabel('Variance')
plt.xlabel('Principal components')
for x,y in zip(range(1, n_components + 1),cum_var):
    label = "{:.2f}".format(y)
    plt.annotate(label, # this is the text
                 (x,y), # this is the point to label
                 textcoords = "offset points", # how to position the text
                 xytext = (0,10), # distance from text to points (x,y)
                 ha = 'center'
                 ,fontsize=12)
plt.show()
# Keep 28 principal components — chosen from the cumulative-variance plot above.
pca = PCA(n_components = 28)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)
# From here on the pipeline works with plain numpy arrays.
X_train = np.array(X_train_pca)
y_train = np.array(y_train)
y_validation = np.array(y_validation)
X_train.shape
# Reuse the PCA that was fitted on the training set.
def apply_pca(data, pca):
    """Project *data* with the fitted *pca* and return the result as a numpy array."""
    return np.array(pca.transform(data))
# Project the validation set with the train-fitted PCA.
X_validation = apply_pca(X_validation, pca)
X_validation.shape
y_validation.shape
# Run the full pipeline on the held-out test set: drop the same raw
# columns, preprocess with train statistics, then scale and project.
X_test = X_test.drop(['anon_feat_13', 'company','order_month'],axis=1) # dropping the same features we dropped from the train set earlier
X_test, y_temp = preprocessing_func(X_train_copy, X_test)
X_test = apply_scaling(X_test, scaler)
X_test = apply_pca(X_test, pca)
X_test.shape
# Grid search for Logistic Regression hyper-parameters (kept for reference;
# the best parameters found are hard-coded below).
# lr_clf = LogisticRegression()
# param_grid = {'penalty': ['l1', 'l2'] ,
#               'C':[0.01,.09,1,5,10,25],
#               'solver':['lbfgs', 'liblinear'],
#               'max_iter' : [50,100,200],
#               'tol': [0.0001, 1e-4]
#               }
# # Create grid search object
# clf = GridSearchCV(estimator = lr_clf, param_grid = param_grid, scoring='roc_auc' ,cv = 3, verbose = 2, n_jobs=-1)
# clf.fit(X_train,y_train)
# clf.best_params_
# Logistic Regression with the grid-search best parameters.
lgr_clf = LogisticRegression(
    C = 0.01,
    max_iter= 50,
    penalty= 'l2',
    solver= 'lbfgs',
    tol= 0.0001,
)
lgr_model = lgr_clf.fit(X_train, y_train)
# Validation-set ROC curve and AUC.
fpr, tpr, thresholds = roc_curve(y_validation, lgr_model.predict_proba(X_validation)[:,1])
print('AUC score for Logistic Regression Test:', auc(fpr, tpr))
# Grid search for KNN hyper-parameters (kept for reference; best found below).
# knn_clf = KNeighborsClassifier()
# param_grid = {'n_neighbors': [20, 50, 70, 60] ,
#               'weights':['uniform', 'distance'],
#               'metric':['euclidean','manhattan']
#               }
# clf = GridSearchCV(knn_clf, param_grid = param_grid, scoring='roc_auc' ,cv = 3, verbose=True, n_jobs=-1)
# clf.fit(X_train,y_train)
# clf.best_params_
# KNN with the grid-search best parameters.
knn_clf = KNeighborsClassifier(
    n_neighbors = 20,
    weights = 'distance',
    metric = 'manhattan'
)
knn_model = knn_clf.fit(X_train, y_train)
# Validation-set ROC curve and AUC.
fpr, tpr, thresholds = roc_curve(y_validation, knn_model.predict_proba(X_validation)[:,1])
print('AUC score for KNN Test:', auc(fpr, tpr))
# Grid search for Decision Tree hyper-parameters (kept for reference; best found below).
# dt_clf = DecisionTreeClassifier()
# param_grid= {
#     'criterion':['gini','entropy'],
#     'max_depth': [None],
#     'min_samples_split': [2, 4, 8],
#     'min_samples_leaf': [2, 3, 4]
# }
# clf = GridSearchCV(dt_clf, param_grid = param_grid, scoring='roc_auc' ,cv = 3, verbose=True, n_jobs=-1)
# clf.fit(X_train,y_train)
# clf.best_params_
# Decision Tree with the grid-search best parameters.
dt_clf = DecisionTreeClassifier(
    criterion = 'gini',
    max_depth = None,
    min_samples_leaf = 4,
    min_samples_split = 2
)
dt_model = dt_clf.fit(X_train, y_train)
# Consistency fix: score via dt_model like the other model cells
# (fit() returns the same fitted estimator, so behavior is unchanged).
fpr, tpr, thresholds = roc_curve(y_validation, dt_model.predict_proba(X_validation)[:,1])
print('AUC score for Decision Tree Test:', auc(fpr, tpr))
# Grid search for Random Forest hyper-parameters (kept for reference; best found below).
# rf_clf = RandomForestClassifier(random_state=6)
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [None],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'min_samples_leaf': [2, 3, 4],
#     'min_samples_split': [2, 4, 8],
#     'n_estimators': [150, 200, 250],
#     'criterion' :['gini','entropy']
# }
# grid_search_rf = GridSearchCV(estimator= rf_clf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 10)
# grid_search_rf.fit(X_train, y_train)
# grid_search_rf.best_params_
# Random Forest with the grid-search best parameters.
rf_clf = RandomForestClassifier(
    bootstrap= True,
    criterion='entropy',
    max_depth= None,
    # Fix: 'auto' was removed in scikit-learn 1.3; for classifiers it was
    # an alias of 'sqrt', so this preserves the exact behavior.
    max_features= 'sqrt',
    min_samples_leaf= 2,
    min_samples_split = 2,
    n_estimators = 250
)
rf_model = rf_clf.fit(X_train, y_train)
# Validation-set ROC curve and AUC.
fpr, tpr, thresholds = roc_curve(y_validation, rf_model.predict_proba(X_validation)[:,1])
print('AUC score for Random Forest Test:', auc(fpr, tpr))
def KfoldPlot(X, y, clf, k):
    """Plot per-fold and mean ROC curves for *clf* using k-fold cross-validation.

    X, y : numpy arrays (indexed positionally by the fold indices).
    clf  : an estimator exposing fit / predict_proba; refitted on each fold.
    k    : number of folds.

    Fixes vs the original: removed the duplicated ``mean_fpr`` assignment
    and the unused ``tprs``/``aucs``/``all_tpr`` accumulators, and replaced
    the manual fold counter with enumerate. Behavior is unchanged.
    """
    clf_name = str(clf)[:str(clf).find("(")]
    kf = KFold(n_splits = k)
    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = 0.0
    plt.figure(figsize=[20,20])
    for i, (train, test) in enumerate(kf.split(X, y), start=1):
        # Refit on the fold's training part, score its held-out part.
        probas_ = clf.fit(X[train], y[train]).predict_proba(X[test])
        # Compute ROC curve and area under the curve for this fold.
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        # Accumulate the TPR interpolated onto a common FPR grid for the mean curve.
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label = 'ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))  # chance diagonal
    mean_tpr = mean_tpr / k
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',
             label='Mean ROC (AUC = %0.4f)' % mean_auc, lw=2)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve for {} model'.format(clf_name))
    plt.legend(loc="lower right")
    plt.show()
# Combine train and validation sets for a cross-validated ROC comparison
# of the four fitted models.
X = np.concatenate((X_train,X_validation))
y = np.concatenate((y_train,y_validation))
KfoldPlot(X, y, lgr_model, 10)
KfoldPlot(X, y, knn_model, 10)
KfoldPlot(X, y, dt_model, 10)
KfoldPlot(X, y, rf_model, 10)
# Confusion matrix of the final Random Forest model on the validation set.
plot_confusion_matrix(rf_model,X_validation,y_validation,cmap='rocket_r')
# Fix: derive precision/recall from the model's actual validation
# predictions instead of hard-coded confusion-matrix counts, so the
# numbers stay correct if the data, split, or model change.
y_pred = rf_model.predict(X_validation)
tp = np.sum((y_pred == 1) & (y_validation == 1))
fp = np.sum((y_pred == 1) & (y_validation == 0))
fn = np.sum((y_pred == 0) & (y_validation == 1))
precision_score = tp / (tp + fp)  # TP / (TP + FP)
recall_score = tp / (tp + fn)  # TP / (TP + FN)
print ("The precision score is", precision_score)
print("The recall score is", recall_score)
# Refit on train+validation before predicting probabilities for the test set.
rf_model.fit(X, y)
y_prob = rf_model.predict_proba(X_test)[:, 1]
# Export the per-order cancellation probabilities in the submission format.
df_predicted = pd.DataFrame({' ': X_test_Order_ID, 'cancel_proba': y_prob})
df_predicted = df_predicted.set_index(' ') #just to fit it to the submission tamplate
df_predicted.to_csv('Submission_19.csv',index = True, float_format='%.8f')